Prepare

# Attach the core tidyverse packages (dplyr, tidyr, ggplot2, purrr, ...).
# Spell out FALSE: the shorthand `F` is an ordinary variable that can be
# reassigned, whereas FALSE is a reserved word.
library(tidyverse, warn.conflicts = FALSE)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(plotly)
## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout
library(ggplot2)
library(purrr) # for functional programming
# Load the Netflix titles table; both empty strings and the literal "NA" are
# read as missing values.
df <- read.csv(
  file = "data/netflix_titles.csv",
  na.strings = c("", "NA")
)

Get country information

# Count titles per (country, release_year, type).
# A title's `country` field may list several comma-separated countries, so it
# is split into one row per country before counting.
grouped <- df %>%
  filter(!is.na(country), country != "") %>%
  mutate(country = lapply(strsplit(as.character(country), ","), trimws)) %>%
  unnest(country) %>%
  # Splitting can yield empty tokens (e.g. from a leading or doubled comma);
  # drop them before aggregating rather than afterwards.
  filter(!is.na(country), country != "") %>%
  group_by(country, release_year, type) %>%
  # Return an ungrouped tibble explicitly instead of relying on the default
  # "drop last group" behaviour and its informational message.
  summarise(cnt = n(), .groups = "drop")

Grouping

# Collapse the per-year counts into one total per (country, type).
by_country_type <- grouped %>%
  group_by(country, type) %>%
  # Drop the grouping explicitly so later steps (plots, further summaries)
  # receive a plain tibble instead of one still grouped by `country`.
  summarise(cnt = sum(cnt), .groups = "drop")

Movie Distribution by country

# World choropleth of movie counts: countries are matched by name, shaded on
# the "OrRd" palette by count, with the country and count as hover text.
by_country_type %>%
  filter(type == "Movie") %>%
  plot_ly(
    type = "choropleth",
    locations = ~country,
    locationmode = "country names",
    z = ~cnt,
    color = ~cnt,
    colors = "OrRd",
    colorbar = list(title = "Counts"),
    text = ~paste(country, "<br>Counts: ", cnt)
  )

TV Show Distribution by country

# World choropleth of TV show counts: countries are matched by name, shaded on
# the "OrRd" palette by count, with the country and count as hover text.
by_country_type %>%
  filter(type == "TV Show") %>%
  plot_ly(
    type = "choropleth",
    locations = ~country,
    locationmode = "country names",
    z = ~cnt,
    color = ~cnt,
    colors = "OrRd",
    colorbar = list(title = "Counts"),
    text = ~paste(country, "<br>Counts: ", cnt)
  )

Gathering data

# For each (country, type) row, attach the country's total title count and the
# type's share of that total, as a percentage rounded to one decimal place.
# `mutate()` keeps one row per input row; returning several rows per group
# from `summarise()` is deprecated since dplyr 1.1.0.
type_prop <- by_country_type %>%
  group_by(country) %>%
  mutate(
    total = sum(cnt),
    prop = round(cnt / total * 100, 1)
  ) %>%
  ungroup() %>%
  select(country, total, prop, type) %>%
  as.data.frame() %>%
  # NOTE(review): top_n() ranks rows, not countries. A country with both types
  # contributes two rows with the same `total`, so this keeps roughly the ten
  # largest countries rather than twenty -- confirm which was intended.
  top_n(20, wt = total)
# Display order of countries for the plot. Movie rows sort first, by
# descending Movie share; non-Movie rows get a negated share so they sort
# after every Movie row and only contribute countries with no Movie row.
# `unique()` keeps the first position of each country.
# `pull()` extracts the column as a plain character vector, replacing the
# select/array/flatten chain (purrr::flatten() is deprecated as of purrr 1.0.0).
custom_order <- type_prop %>%
  arrange(desc(ifelse(type == "Movie", prop, -prop))) %>%
  pull(country) %>%
  unique()
# Horizontal stacked bars: one bar per country (ordered by `custom_order`),
# split into the Movie and TV Show percentage shares.
ggplot(
  type_prop,
  aes(x = prop, y = factor(country, levels = custom_order), fill = type)
) +
  geom_col() +
  labs(
    title = "Proportions of Movie and TV Show by Country",
    # Axis titles follow the aesthetics: shares run along x, countries along y.
    x = "Proportion (%)",
    y = "Country",
    fill = "Type"
  ) +
  scale_x_continuous(
    labels = scales::percent_format(scale = 1),
    limits = c(0, 100)
  ) +
  # Name the colours so each type keeps its colour regardless of level order.
  scale_fill_manual(values = c("Movie" = "#221f1f", "TV Show" = "#b20710")) +
  theme_minimal()